Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.
Run GO Term enrichment analysis using results from DGE analysis
(i.e., after running the 1a-DGE.Rmd script). Red text indicates regions that require the
user to modify. Regardless, the user should check over all code blocks
to ensure that everything is running correctly.
Load packages
library(goseq)
library(tidyverse)
library(GSEABase)
library(data.table)
library(ggplot2)
library(cowplot)
library(patchwork)
Change file names and conditions where appropriate.
# Input unfiltered data
treatmentinfo.file <- "salmon.numreads.DGE_samples.tsv"
dge_results.file <- "salmon.numreads.DGE_results.tsv"
annotation.file <- "Pocillopora_acuta_KBHIv2.pep.GOs.tsv"
# Output DEG results
GO_term_sig.file <- "salmon.numreads.DGE_results.GOsig.tsv"
KEGG_sig.file <- "salmon.numreads.DGE_results.KEGGsig.tsv"
# Cutoff for significant GO term p-values
GO_enrich_pvalue.cutoff <- 0.05
KEGG_enrich_pvalue.cutoff <- 0.05
# KEGG ID description file - provided with script
KEGG_IDs_Descriptions.file <- "1b-KEGG_IDs_Descriptions.tsv"
Load the input file containing the treatment information.
treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
## sample_id unit lib_type fq1 fq2 species plug_id
## 1 Pacuta_ATAC_TP8_1051 1 pe SRR14610912 SRR14610912 Pacuta 1051
## 2 Pacuta_ATAC_TP8_1755 1 pe SRR14610911 SRR14610911 Pacuta 1755
## 3 Pacuta_ATAC_TP8_2012 1 pe SRR14610910 SRR14610910 Pacuta 2012
## 4 Pacuta_HTAC_TP8_1329 1 pe SRR14611019 SRR14611019 Pacuta 1329
## 5 Pacuta_HTAC_TP8_1765 1 pe SRR14611018 SRR14611018 Pacuta 1765
## 6 Pacuta_HTAC_TP8_2513 1 pe SRR14611017 SRR14611017 Pacuta 2513
## treatment timepoint reef ploidy group
## 1 ATAC 8 Reef.11.13 3 Group1
## 2 ATAC 8 Reef.42.43 2 Group5
## 3 ATAC 8 Reef.42.43 3 Group2
## 4 HTAC 8 Lilipuna.Fringe 2 Group6
## 5 HTAC 8 Reef.42.43 3 Group3
## 6 HTAC 8 Reef.18 3 Group2
# Check we have the right column names
headers <- c("sample_id")
if( all(headers %in% colnames(treatmentinfo)) ){
print(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.DGE_samples.tsv has the required columns: sample_id"
Load the input file containing DGE results.
dge_results <- as.data.frame(read.csv(dge_results.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file
# Check we have the right column names
headers <- c("gene_id")
if( all(headers %in% colnames(dge_results)) ){
print(paste(dge_results.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(dge_results.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.DGE_results.tsv has the required columns: gene_id"
Load the input file containing gene annotations (GO terms, KEGG IDs, and gene/protein lengths).
# Annotations
annot <- read.csv(annotation.file, header=TRUE, sep="\t", fileEncoding="UTF-8-BOM")
# Check we have the right column names
headers <- c("gene_id", "GO_IDs", "KEGG_IDs", "length")
if( all(headers %in% colnames(annot)) ){
paste(paste(annotation.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(annotation.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "Pocillopora_acuta_KBHIv2.pep.GOs.tsv has the required columns: gene_id, GO_IDs, KEGG_IDs, length"
# Select only relevant information
Go.ref <- subset(annot, select= c(gene_id, length))
# Merge dge_results by available annotations
Go.ref <- merge(dge_results, Go.ref, by = "gene_id")
Go.ref <- unique(Go.ref)
dim(Go.ref)
## [1] 20 8
Set ID and gene length vectors, and make a binary matrix indicating which genes are differentially expressed. These are used as input to nullp, which for calculates a Probability Weighting Function for DEGs.
# Make ID and length vectors
IDvector <- annot$gene_id
lengthVector <- annot$length
# Get binary list indicating which genes are in DGE results and which are not out of all genes
dge.genes <- as.integer(annot$gene_id %in% Go.ref$gene_id)
names(dge.genes) <- annot$gene_id
print(paste("Number of DGE genes: ", length(dge.genes[dge.genes == 1]), sep=''))
## [1] "Number of DGE genes: 20"
print(paste("Number of NON DGE genes: ", length(dge.genes[dge.genes == 0]), sep=''))
## [1] "Number of NON DGE genes: 24892"
# Weight vector by length of gene
pwf <- nullp(DEgenes=dge.genes, id=IDvector, bias.data=lengthVector)
## Warning in pcls(G): initial point very close to some inequality constraints
Prepare GO term dataframe.
# Cleanup GO terms and split multiple GO terms per line, into one GO term per line
GO.annot <- annot %>%
subset(select=c(gene_id, GO_IDs)) %>%
drop_na() %>% # Remove rows with missing GO terms or gene IDs
filter(GO_IDs!="-") # Remove rows with missing values indicated by '-'
splitted <- strsplit(as.character(GO.annot$GO_IDs), "; ") # Split into multiple GO ids
GO.terms <- data.frame(v1 = rep.int(GO.annot$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their GO terms in a single row
colnames(GO.terms) <- c("gene_id", "GO.ID")
# Cleanup single-line GO term dataframe
GO.terms <- GO.terms %>%
mutate(GO.ID = gsub(" ", "", GO.ID)) %>% # Remove spaces from GO terms - in case any exist by mistake
mutate(GO.ID = as.character(GO.ID)) %>%
mutate(GO.ID = factor(GO.ID, levels=unique(GO.ID))) %>%
mutate(gene_id = factor(gene_id, levels=unique(gene_id))) %>%
unique()
# Print stats
print(paste("No rows in 'GO.terms': ", dim(GO.terms)[1], sep=''))
## [1] "No rows in 'GO.terms': 1655777"
print(paste("Avg GO IDs per gene: ", nrow(GO.terms) / length(unique(GO.terms$gene_id)), sep=''))
## [1] "Avg GO IDs per gene: 142.26110490592"
head(GO.terms)
## gene_id GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001101
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001932
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001933
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002009
## 5 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002020
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002165
Find enriched GO terms, “selection-unbiased testing for category enrichment amongst significantly expressed genes for RNA-seq data”.
GOall <- goseq(pwf, GOref$gene_id, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
Find only enriched GO terms that are statistically significant at cutoff.
GOall.filtered <- GOall %>%
filter(over_represented_pvalue < GO_enrich_pvalue.cutoff | under_represented_pvalue < GO_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff,
"Over",
if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff,
"Under",
"NULL"
)
)
)
# Print stats
print(paste("Number GOs BEFORE sig filtering: ", nrow(GOall), sep=''))
## [1] "Number GOs BEFORE sig filtering: 21041"
print(paste("Number GOs AFTER sig filtering: ", nrow(GOall.filtered), sep=''))
## [1] "Number GOs AFTER sig filtering: 81"
print(paste("Number GOs AFTER sig filtering OVER: ", nrow(GOall.filtered %>% filter(over_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering OVER: 81"
print(paste("Number GOs AFTER sig filtering UNDER: ", nrow(GOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering UNDER: 0"
print(paste("Number sig BP terms: ", nrow(filter(GOall.filtered, ontology=="BP")), sep=''))
## [1] "Number sig BP terms: 53"
print(paste("Number sig MF terms: ", nrow(filter(GOall.filtered, ontology=="MF")), sep=''))
## [1] "Number sig MF terms: 10"
print(paste("Number sig CC terms: ", nrow(filter(GOall.filtered, ontology=="CC")), sep=''))
## [1] "Number sig CC terms: 13"
Correct any un-annotated terms/ontologies.
NAs.ontology <- GOall.filtered %>% subset(is.na(term))
NAs.ontology
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 77 GO:0044214 0.01175946 0.9999387 1
## 78 GO:0089717 0.01175946 0.9999387 1
## 79 GO:0008329 0.01418905 0.9999095 1
## 80 GO:1901483 0.03456946 0.9994421 1
## 81 GO:1901485 0.03456946 0.9994421 1
## numInCat term ontology dir
## 77 14 <NA> <NA> Over
## 78 14 <NA> <NA> Over
## 79 17 <NA> <NA> Over
## 80 34 <NA> <NA> Over
## 81 34 <NA> <NA> Over
Save significant terms.
write.table(GOall.filtered, file = GO_term_sig.file, row.names=F, quote=F, sep='\t')
Run GOslim to get broader categories.
slim <- getOBOCollection("http://current.geneontology.org/ontology/subsets/goslim_generic.obo") # Get GO database
## BP
BP_GO <- GOall.filtered %>%
filter(ontology=="BP")
BPGO_collection <- GOCollection(BP_GO$category) # Make library of query terms
slims_bp <- data.frame(goSlim(BPGO_collection, slim, "BP")) # Find common parent terms to slim down our list
slims_bp$category <- row.names(slims_bp) # Save rownames as category
## MF
MF_GO <- GOall.filtered %>%
filter(ontology=="MF")
MFGO_collection <- GOCollection(MF_GO$category) # Make library of query terms
slims_mf <- data.frame(goSlim(MFGO_collection, slim, "MF")) # Find common parent terms to slim down our list
slims_mf$category <- row.names(slims_mf) # Save rownames as category
Get mapped terms, using functions from Sam White’s Biostars post.
# Write function mappedIds to get the query terms that mapped to the slim categories
mappedIds <-function(df, collection, OFFSPRING) { # The command to run requires a dataframe of slim terms, like slims_MF above, your list of query terms, and the offspring from the GOCollection by goSlim
map <- as.list(OFFSPRING[rownames(df)]) # Subset GOcollection offspring by the rownames of your dataframe
mapped <- lapply(map, intersect, ids(collection)) # Find the terms that intersect between the subset made above of your query terms and the GOids from the GO collection
df[["go_terms"]] <- vapply(unname(mapped), paste, collapse = ";", character(1L)) # Add column "go_terms" with matching terms
df # Show resulting dataframe
}
# Run function for MF and BP terms
BPslim <- mappedIds(slims_bp, BPGO_collection, GOBPOFFSPRING)
MFslim <- mappedIds(slims_mf, MFGO_collection, GOMFOFFSPRING)
Remove duplicate matches, keeping the broader umbrella term
# BP
BPslim <- filter(BPslim, Count>0 & Term!="biological_process") # Filter out empty slims and term "biological process"
BPsplitted <- strsplit(as.character(BPslim$go_terms), ";") # Split into multiple GO ids
BPslimX <- data.frame(Term = rep.int(BPslim$Term, sapply(BPsplitted, length)), go_term = unlist(BPsplitted)) # List all
BPslimX <- merge(BPslimX, BPslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
BPslimX <- unique(setDT(BPslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only those that appear in the larger umbrella term (larger Count number)
BPslim <- data.frame(slim_term=BPslimX$Term, slim_cat=BPslimX$category, category=BPslimX$go_term) # Rename columns
head(BPslim)
## slim_term slim_cat category
## 1 membrane organization GO:0061024 GO:0006911
## 2 immune system process GO:0002376 GO:0006955
## 3 chromatin organization GO:0006325 GO:0007549
## 4 signaling GO:0023052 GO:0007603
## 5 signaling GO:0023052 GO:0008377
## 6 chromatin organization GO:0006325 GO:0009047
# MF
MFslim <- filter(MFslim, Count>0 & Term!="molecular_function") # Filter out empty slims and term "molecular function"
MFsplitted <- strsplit(as.character(MFslim$go_terms), ";") # Split into multiple GO ids
MFslimX <- data.frame(Term = rep.int(MFslim$Term, sapply(MFsplitted, length)), go_term = unlist(MFsplitted)) # List all
MFslimX <- merge(MFslimX, MFslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
MFslimX <- unique(setDT(MFslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only
MFslim <- data.frame(slim_term=MFslimX$Term, slim_cat=MFslimX$category, category=MFslimX$go_term) # Rename columns
head(MFslim)
## slim_term slim_cat category
## 1 lipid binding GO:0008289 GO:0001530
## 2 cargo receptor activity GO:0038024 GO:0005044
## 3 transporter activity GO:0005215 GO:0010461
## 4 molecular function regulator activity GO:0098772 GO:0038177
## 5 molecular transducer activity GO:0060089 GO:0038187
Add back GO enrichment info for each offspring term.
GO.BP <- right_join(BPslim, filter(GOall.filtered, ontology=="BP"), by="category")
GO.MF <- right_join(MFslim, filter(GOall.filtered, ontology=="MF"), by="category")
Plot heatmap of BP and MF GO slim terms.
term_label_text_size <- 6
slim_label_text_size <- 6
BPplot <- GO.BP %>%
filter(numInCat>5) %>%
mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
ggplot(aes(x = dir, y = term)) +
geom_tile(aes(fill=over_represented_pvalue, width = 1)) +
facet_grid(slim_term ~ ontology, scales = "free_y", labeller = label_wrap_gen(width = 10, multi_line = TRUE))+
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = slim_label_text_size, face = "bold"),
strip.text.x = element_text(size = 12, face = "bold"),
axis.title.x = element_blank(),
axis.title.y = element_text(size=15),
axis.text = element_text(size = term_label_text_size), legend.position = "None",
plot.margin = unit(c(0,1,0,0.25), "cm")
); BPplot
MFplot <- GO.MF %>%
filter(numInCat>5) %>%
mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
ggplot(aes(x = dir, y = term)) +
geom_tile(aes(fill=over_represented_pvalue, width = 1)) +
scale_y_discrete(position = "right") +
facet_grid(slim_term ~ ontology,
scales = "free_y",
labeller = label_wrap_gen(width = 10, multi_line = TRUE),
switch="y" # Put the y facet strips on the left
) +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
strip.text.y.left = element_text(angle=0, size = slim_label_text_size, face = "bold"),
strip.text.x = element_text(size = 12, face = "bold"),
axis.title = element_blank(),
axis.text = element_text(size = term_label_text_size),
legend.title = element_text(size = 12),
legend.text = element_text(size = 11)
); MFplot
Combined BP and MF plots.
BPplot + MFplot + plot_annotation(tag_levels = "A", tag_suffix = ")") & theme(plot.tag = element_text(size=15, face="bold"))
Select KEGG Orthogroup IDs (KOs) from annotation dataframe.
# Extract gene_ids and KO from annotation file
KO.terms <- annot %>%
subset(select=c(gene_id, KEGG_IDs)) %>% # Select columns
drop_na() %>% # Remove rows with missing GO terms or gene IDs
filter(KEGG_IDs!="-") # Remove rows with missing values indicated by '-'
# Split multiple KEGG IDs per line into multiple lines
splitted <- strsplit(as.character(KO.terms$KEGG_IDs), "; ") # Split into multiple KEGG IDs
KO.terms <- data.frame(v1 = rep.int(KO.terms$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their KEGG IDs in a single row
colnames(KO.terms) <- c("gene_id", "KEGG_IDs")
KO.terms <- unique(KO.terms)
colnames(KO.terms) <- c("gene_id", "GO.ID")
head(KO.terms)
## gene_id GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1a K00549
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b K16866
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g18333.t1 K03386
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g7985.t1 K12418
## 5 Pocillopora_acuta_KBHIv2___TS.g15308.t1 K13755
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g2057.t1 K11001
# Bind KO and GO references
GOKO.terms <- bind_rows(GO.terms, KO.terms)
Perform Kegg enrichment with goseq package.
KOall <- goseq(pwf, GOref$gene_id, gene2cat=GOKO.terms, test.cats=c("KEGG"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
Extract significantly enriched KEGG terms.
KOall.filtered <- KOall %>%
filter(over_represented_pvalue < KEGG_enrich_pvalue.cutoff | under_represented_pvalue < KEGG_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff,
"Over",
if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff,
"Under",
"NULL"
)
)
) %>%
mutate(ontology = replace_na(ontology, "KEGG")) %>%
filter(ontology=="KEGG")
# Print stats
print(paste("Number KEGG IDs BEFORE sig filtering: ", nrow(KOall), sep=''))
## [1] "Number KEGG IDs BEFORE sig filtering: 28185"
print(paste("Number KEGG IDs AFTER sig filtering: ", nrow(KOall.filtered), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering: 17"
print(paste("Number KEGG IDs AFTER sig filtering OVER: ", nrow(KOall.filtered %>% filter(over_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering OVER: 17"
print(paste("Number KEGG IDs AFTER sig filtering UNDER: ", nrow(KOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering UNDER: 0"
Add KO definitions.
# Load definition file
KEGG_IDs_Descriptions <- read.table(KEGG_IDs_Descriptions.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM", quote='') # Read in file
# Prep definition data
KEGG_IDs_Descriptions <- KEGG_IDs_Descriptions %>%
subset(select=c("D.ID", "D.Description")) %>%
unique()
colnames(KEGG_IDs_Descriptions) <- c("category", "description")
# Merge with KEGG output
KOall.filtered.descriptions <- unique(left_join(KOall.filtered, KEGG_IDs_Descriptions, by=c("category")))
Write output KEGG enrichment files.
write.table(KOall.filtered.descriptions, file = KEGG_sig.file, row.names=F, quote=F, sep='\t')